별점분포의 mean, var, skewness 구하기 - expect_df

  • $\mu = E[X] = \sum x \cdot p(x)$
  • $\sigma^2 = E[(X - \mu)^2] = \sum (x - \mu)^2 p(x)$
  • $\frac{\mu_3}{\sigma^3} = \operatorname{E}\left[\left(\frac{X-\mu}{\sigma}\right)^3 \right]$

In [1]:
# The notebook uses `pd` and `np` throughout; import them here so that a
# fresh kernel can run it from the top.
import numpy as np
import pandas as pd

# Load the rating-count table; its column labels (0.5 ... 5) are the ratings.
df = pd.read_csv('../resource/preprocess_dist_df.csv')
# The same labels parsed to floats, used below as the values of X.
col = df.columns.astype(float).values
df.head()


Out[1]:
0.5 1 1.5 2 2.5 3 3.5 4 4.5 5
0 7 10 14 83 50 1472 454 4509 4318 2108
1 1312 2238 2150 6749 6597 9397 16842 1367 9011 2459
2 228 316 956 1513 2367 7526 9953 7289 21200 14948
3 3615 4063 5424 8133 11525 6501 17566 765 7099 2340
4 787 1612 1329 6635 5251 13675 15493 4620 14473 4299

별점분포의 X_count


In [2]:
# Row total of the rating counts.  Any existing 'sum' column is dropped first
# so that re-running this cell does not fold the previous total into the new one.
df['sum'] = df.drop('sum', axis=1, errors='ignore').sum(axis=1)
df.head()


Out[2]:
0.5 1 1.5 2 2.5 3 3.5 4 4.5 5 sum
0 7 10 14 83 50 1472 454 4509 4318 2108 13025
1 1312 2238 2150 6749 6597 9397 16842 1367 9011 2459 58122
2 228 316 956 1513 2367 7526 9953 7289 21200 14948 66296
3 3615 4063 5424 8133 11525 6501 17566 765 7099 2340 67031
4 787 1612 1329 6635 5251 13675 15493 4620 14473 4299 68174

$p(X)$


In [3]:
# Spot check of p(X = 0.5) for row 1: its 0.5 count divided by its row total.
1312 / 58122


Out[3]:
0.022573208079556796

In [4]:
# p(X): divide every rating count by the total of its row.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0;
# the label slice :'5' keeps the rating columns and leaves out 'sum'.
prob_df = df.loc[:, :'5'].div(df['sum'], axis=0)
prob_df.head()


Out[4]:
0.5 1 1.5 2 2.5 3 3.5 4 4.5 5
0 0.000537 0.000768 0.001075 0.006372 0.003839 0.113013 0.034856 0.346180 0.331516 0.161843
1 0.022573 0.038505 0.036991 0.116118 0.113503 0.161677 0.289770 0.023519 0.155036 0.042308
2 0.003439 0.004767 0.014420 0.022822 0.035704 0.113521 0.150130 0.109946 0.319778 0.225474
3 0.053930 0.060614 0.080918 0.121332 0.171935 0.096985 0.262058 0.011413 0.105906 0.034909
4 0.011544 0.023645 0.019494 0.097324 0.077023 0.200590 0.227257 0.067768 0.212295 0.063059

$ x \cdot p(x)$


In [5]:
# Show the rating values (the column labels parsed to float).
col


Out[5]:
array([ 0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ])

In [6]:
# x * p(x): scale each probability column by its rating value
# (`col` is matched to the columns by position).
Xprob_df = prob_df * col
Xprob_df.head()


Out[6]:
0.5 1 1.5 2 2.5 3 3.5 4 4.5 5
0 0.000269 0.000768 0.001612 0.012745 0.009597 0.339040 0.121996 1.384722 1.491823 0.809213
1 0.011287 0.038505 0.055487 0.232236 0.283757 0.485031 1.014194 0.094078 0.697662 0.211538
2 0.001720 0.004767 0.021630 0.045644 0.089259 0.340564 0.525454 0.439785 1.439001 1.127368
3 0.026965 0.060614 0.121377 0.242664 0.429838 0.290955 0.917202 0.045651 0.476578 0.174546
4 0.005772 0.023645 0.029241 0.194649 0.192559 0.601769 0.795399 0.271071 0.955328 0.315296

mean : $\mu = E[X] = \sum x \cdot p(x)$


In [7]:
# E[X]: add up x * p(x) across the columns of each row.
mean = Xprob_df.sum(axis='columns')
mean.head()


Out[7]:
0    4.171785
1    3.123774
2    4.035191
3    2.786390
4    3.384729
dtype: float64

In [8]:
# Keep E[X] next to the probabilities as an extra 'mean' column.
prob_df.loc[:, 'mean'] = Xprob_df.sum(axis='columns')
prob_df.head()


Out[8]:
0.5 1 1.5 2 2.5 3 3.5 4 4.5 5 mean
0 0.000537 0.000768 0.001075 0.006372 0.003839 0.113013 0.034856 0.346180 0.331516 0.161843 4.171785
1 0.022573 0.038505 0.036991 0.116118 0.113503 0.161677 0.289770 0.023519 0.155036 0.042308 3.123774
2 0.003439 0.004767 0.014420 0.022822 0.035704 0.113521 0.150130 0.109946 0.319778 0.225474 4.035191
3 0.053930 0.060614 0.080918 0.121332 0.171935 0.096985 0.262058 0.011413 0.105906 0.034909 2.786390
4 0.011544 0.023645 0.019494 0.097324 0.077023 0.200590 0.227257 0.067768 0.212295 0.063059 3.384729

std : $ \sigma = \sqrt{\sigma^2},\ (\sigma^2 = E[(X - \mu)^2] = \sum (x - \mu)^2 p(x)) $


In [9]:
# (X - u): deviation of every rating value from each row's mean.
# Broadcasting an (n_rows, 1) column of means against the rating values builds
# the whole table in one step, instead of appending one row per iteration to
# an initially empty frame.  Rows are numbered 0..n-1, as before.
sub_df = pd.DataFrame(col - prob_df['mean'].values[:, None], columns=col)
sub_df.head()


Out[9]:
0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
0 -3.671785 -3.171785 -2.671785 -2.171785 -1.671785 -1.171785 -0.671785 -0.171785 0.328215 0.828215
1 -2.623774 -2.123774 -1.623774 -1.123774 -0.623774 -0.123774 0.376226 0.876226 1.376226 1.876226
2 -3.535191 -3.035191 -2.535191 -2.035191 -1.535191 -1.035191 -0.535191 -0.035191 0.464809 0.964809
3 -2.286390 -1.786390 -1.286390 -0.786390 -0.286390 0.213610 0.713610 1.213610 1.713610 2.213610
4 -2.884729 -2.384729 -1.884729 -1.384729 -0.884729 -0.384729 0.115271 0.615271 1.115271 1.615271

In [10]:
# (X - u)^2, computed as one vectorized operation rather than a per-element
# Python callback (`applymap` is deprecated since pandas 2.1).
# astype(float) keeps the arithmetic numeric in case the row-by-row
# construction left sub_df with object dtype.
sub2_df = sub_df.astype(float) ** 2
sub2_df.head()


Out[10]:
0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
0 13.482005 10.060220 7.138435 4.716650 2.794865 1.373080 0.451295 0.029510 0.107725 0.685940
1 6.884191 4.510417 2.636642 1.262868 0.389094 0.015320 0.141546 0.767772 1.893998 3.520224
2 12.497573 9.212382 6.427192 4.142001 2.356810 1.071620 0.286429 0.001238 0.216048 0.930857
3 5.227579 3.191189 1.654799 0.618409 0.082019 0.045629 0.509239 1.472850 2.936460 4.900070
4 8.321660 5.686931 3.552203 1.917474 0.782745 0.148016 0.013287 0.378559 1.243830 2.609101

In [11]:
# Spot check of one variance term: (x - u)^2 * p(x) for row 0 at x = 0.5.
13.482005*0.000537


Out[11]:
0.007239836685

In [12]:
# var(X) = sum((X - u)^2 * p(X))
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# sub2_df is labelled with float rating values while prob_df keeps the string
# labels read from the CSV, so the probabilities are multiplied in by position
# (`.values`) instead of relying on label alignment between the two frames.
var = sub2_df.mul(prob_df.loc[:, :'5'].values).sum(axis=1)
var.head()


Out[12]:
0    0.391277
1    1.121530
2    0.802008
3    1.335124
4    1.033640
dtype: float64

In [13]:
# Spot check: square root of row 1's variance.
np.sqrt(1.121530)


Out[13]:
1.0590231347803503

In [14]:
# sigma: square root of each row's variance.
std = np.sqrt(var)
std.head()


Out[14]:
0    0.625521
1    1.059023
2    0.895549
3    1.155476
4    1.016681
dtype: float64

skewness : $\frac{\mu_3}{\sigma^3} = \operatorname{E}\left[\left(\frac{X-\mu}{\sigma}\right)^3 \right]$


In [15]:
# (X - u)^3, computed with vectorized multiplication rather than a per-element
# Python callback (`applymap` is deprecated since pandas 2.1).  The product
# x * x * x is kept so the values match the original computation exactly.
# astype(float) keeps the arithmetic numeric in case sub_df has object dtype.
centered = sub_df.astype(float)
sub3_df = centered * centered * centered
sub3_df.head()


Out[15]:
0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
0 -49.503025 -31.908856 -19.072364 -10.243550 -4.672414 -1.608955 -0.303173 -0.005069 0.035357 0.568106
1 -18.062561 -9.579106 -4.281312 -1.419179 -0.242707 -0.001896 0.053253 0.672741 2.606569 6.604734
2 -44.181303 -27.961337 -16.294156 -8.429762 -3.618153 -1.109331 -0.153294 -0.000044 0.100421 0.898100
3 -11.952283 -5.700707 -2.128717 -0.486311 -0.023489 0.009747 0.363398 1.787465 5.031947 10.846844
4 -24.005733 -13.561789 -6.694938 -2.655181 -0.692517 -0.056946 0.001532 0.232916 1.387208 4.214406

In [16]:
# sigma^3: cube of each row's standard deviation.
std3 = std ** 3
std3.head()


Out[16]:
0    0.244752
1    1.187726
2    0.718237
3    1.542703
4    1.050882
dtype: float64

In [17]:
# (X - u)^3 / sigma^3: divide each row of cubed deviations by that row's sigma^3.
skew_df = sub3_df.div(std3, axis=0)
skew_df.head()


Out[17]:
0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
0 -202.257906 -130.372202 -77.925268 -41.852776 -19.090401 -6.573817 -1.238696 -0.020712 0.144460 2.321149
1 -15.207680 -8.065079 -3.604628 -1.194870 -0.204346 -0.001597 0.044836 0.566411 2.194587 5.560822
2 -61.513543 -38.930515 -22.686322 -11.736741 -5.037548 -1.544519 -0.213431 -0.000061 0.139816 1.250422
3 -7.747626 -3.695273 -1.379862 -0.315233 -0.015226 0.006318 0.235560 1.158658 3.261774 7.031066
4 -22.843405 -12.905144 -6.370778 -2.526621 -0.658986 -0.054189 0.001457 0.221639 1.320041 4.010350

In [18]:
# skewness = sum(((X - u) / sigma)^3 * p(X))
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# skew_df is labelled with float rating values while prob_df keeps the string
# labels read from the CSV, so the probabilities are multiplied in by position
# (`.values`) instead of relying on label alignment between the two frames.
skew = skew_df.mul(prob_df.loc[:, :'5'].values).sum(axis=1)
skew.head()


Out[18]:
0   -1.002260
1   -0.347552
2   -1.052706
3   -0.127879
4   -0.452095
dtype: float64

expect_df


In [19]:
# Gather the three per-row statistics into one table, with the column order
# (mean, std, skew) spelled out explicitly.
expect_df = pd.DataFrame(
    {'mean': mean, 'std': std, 'skew': skew},
    columns=['mean', 'std', 'skew'],
)
expect_df.head()


Out[19]:
mean std skew
0 4.171785 0.625521 -1.002260
1 3.123774 1.059023 -0.347552
2 4.035191 0.895549 -1.052706
3 2.786390 1.155476 -0.127879
4 3.384729 1.016681 -0.452095

In [20]:
# Write the mean/std/skew table to CSV; index=False leaves the row index out.
expect_df.to_csv('../resource/preprocess_expectation_df.csv', index=False)